Paul Trowbridge
May 2, 2017
# Collect per-dataset metadata from the NYC Open Data Socrata portal.
#
# Lists the datasets available at https://nycopendata.socrata.com/ and, for
# each one, reads "<column 9 of the listing>.json" and keeps eight fields.
#
# Returns a data frame with one row per listed dataset and the columns
# name, attribution, downloadcount, viewcount, category, createdAt,
# indexUpdatedAt and description. A field that is missing from a dataset's
# JSON is stored as NA. An empty listing yields a zero-row data frame.
update_data <- function() {
  # NA when the field is absent, otherwise its first element (a data frame
  # cell holds exactly one value)
  first_or_na <- function(x) {
    if (is.null(x)) NA else x[1]
  }

  # get list of datasets available
  dlist <- RSocrata::ls.socrata("https://nycopendata.socrata.com/")
  n <- nrow(dlist)

  # preallocate one row per dataset instead of growing the frame in the loop
  ddf <- data.frame(
    name = rep(NA_character_, n),
    attribution = rep(NA_character_, n),
    downloadcount = rep(NA_real_, n),
    viewcount = rep(NA_real_, n),
    category = rep(NA_character_, n),
    createdAt = rep(NA_real_, n),
    indexUpdatedAt = rep(NA_real_, n),
    description = rep(NA_character_, n),
    stringsAsFactors = FALSE
  )

  # loop over list of datasets and extract data; seq_len() makes this a no-op
  # for an empty listing, whereas 1:0 would visit rows 1 and 0
  for (i in seq_len(n)) {
    # get name of json file
    # NOTE(review): column 9 is presumably the dataset's base URL -- confirm
    fn <- paste0(dlist[i, 9], ".json")
    # get json file
    dd <- jsonlite::fromJSON(txt = fn)
    # extract data to be analyzed; [[ ]] matches names exactly, so a missing
    # field is never filled from a longer key that starts with the same text
    ddf[i, 1] <- first_or_na(dd[["name"]])
    ddf[i, 2] <- first_or_na(dd[["attribution"]])
    ddf[i, 3] <- first_or_na(dd[["downloadCount"]])
    ddf[i, 4] <- first_or_na(dd[["viewCount"]])
    ddf[i, 5] <- first_or_na(dd[["category"]])
    ddf[i, 6] <- first_or_na(dd[["createdAt"]])
    ddf[i, 7] <- first_or_na(dd[["indexUpdatedAt"]])
    ddf[i, 8] <- first_or_na(dd[["description"]])
  }
  ddf
}
We can test these ideas formally with a regression model. By fitting the category name as a factor-level covariate, we can identify which categories of datasets are statistically relevant and whether they have a significantly higher download count or view count.
Compute the ratio of downloads to views as:
# ratio of downloads to views: column 3 of dat divided by column 4
rdv <- dat[, 3] / dat[, 4]
# regress the log ratio on category with the intercept removed, so every
# category gets its own coefficient; adding 0.001 keeps the log finite when
# the ratio is zero
lm.fit <- lm(
  log(rdv + 0.001) ~ dat$category - 1
)
print(summary(lm.fit))
##
## Call:
## lm(formula = log(rdv + 0.001) ~ dat$category - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7812 -0.5486 0.1430 0.7005 4.1462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## dat$categoryBusiness 0.01621 0.14770 0.110 0.91264
## dat$categoryCity Government 0.25565 0.05582 4.580 5.03e-06 ***
## dat$categoryEducation 0.60415 0.07038 8.584 < 2e-16 ***
## dat$categoryEnvironment -0.15303 0.11264 -1.359 0.17447
## dat$categoryHealth -0.26550 0.20027 -1.326 0.18514
## dat$categoryHousing & Development 0.25240 0.08681 2.908 0.00369 **
## dat$categoryNYC BigApps -0.63070 0.15174 -4.156 3.41e-05 ***
## dat$categoryPublic Safety 0.87349 0.08266 10.568 < 2e-16 ***
## dat$categoryRecreation -0.28128 0.13051 -2.155 0.03130 *
## dat$categorySocial Services 0.51294 0.10490 4.890 1.11e-06 ***
## dat$categoryTransportation -0.60722 0.09325 -6.512 9.99e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.115 on 1555 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.1647, Adjusted R-squared: 0.1588
## F-statistic: 27.88 on 11 and 1555 DF, p-value: < 2.2e-16
We can, in a sense, “invert” our problem. Instead of treating the download and view counts as a random response and analyzing how they vary as a function of Category, we can ask: given those counts, how well can we classify the Category of a dataset?
We’ll evaluate the performance of our classifier by out-of-sample predictive performance.
# add age to dataset: days between today's date and the date obtained by
# reading column 6 of dat as seconds since 1970-01-01
age <- local({
  created_on <- as.Date(as.POSIXct(dat[, 6], origin = "1970-01-01"))
  as.numeric(difftime(Sys.Date(), created_on, units = "days"))
})
dat <- cbind(dat, age)
# draw one 0/1 flag per row, equal to 1 with probability 0.7; rows flagged 1
# form the training set
# NOTE(review): no set.seed() call is visible, so the split presumably
# differs from run to run -- confirm whether a seed is set elsewhere
train <- rbinom(nrow(dat), size = 1, prob = 0.7)
mean(train)
## [1] 0.7164272
# hold out every row whose training flag is 0
test <- dat[train == 0, ]
# quadratic discriminant analysis of category on the two counts and age,
# fitted only on the rows flagged for training
qda.fit <- MASS::qda(
  category ~ downloadcount + viewcount + age,
  data = dat,
  subset = as.logical(train)
)
# predicted classes for the held-out rows
qda.pred <- predict(qda.fit, newdata = test)
qda.class <- qda.pred$class
# share of held-out rows whose predicted category equals the recorded one,
# ignoring comparisons that come out NA
print(mean(qda.class == test$category, na.rm = TRUE))
## [1] 0.2031603
# diagonal of cm divided by its row totals, one value per class
# NOTE(review): cm is not defined in the visible part of this document;
# presumably a confusion table with predicted classes in rows and recorded
# categories in columns -- confirm
prec <- diag(cm) / rowSums(cm)
print(prec)
## Business City Government Education
## 1.00000000 0.20000000 0.18154762
## Environment Health Housing & Development
## 0.08333333 NaN 0.14285714
## NYC BigApps Public Safety Recreation
## 0.85000000 1.00000000 0.00000000
## Social Services Transportation
## 0.00000000 0.25000000
# diagonal of cm divided by its column totals, one value per class
# NOTE(review): cm is not defined in the visible part of this document;
# presumably a confusion table with predicted classes in rows and recorded
# categories in columns -- confirm
recall <- diag(cm) / colSums(cm)
print(recall)
## Business City Government Education
## 0.05000000 0.01869159 0.91044776
## Environment Health Housing & Development
## 0.13333333 0.00000000 0.01587302
## NYC BigApps Public Safety Recreation
## 0.80952381 0.02272727 0.00000000
## Social Services Transportation
## 0.00000000 0.07142857